A model to predict whether a comment or sentence written on Twitter or Reddit is positive, negative, or neutral.
import numpy as np
import pandas as pd
import os
#For Preprocessing
import re
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
import seaborn as sns
#For data visualization
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.plotting.backend = "plotly"
import warnings
warnings.filterwarnings("ignore")
# set seed for reproducibility of the project
SEED = 2021

def seed(SEED=SEED):
    """Seed numpy's RNG and Python's hash seed so runs are reproducible.

    Args:
        SEED: integer seed value (defaults to the module-level SEED).
    """
    # BUG FIX: the original did `np.random.seed = SEED`, which REPLACED the
    # seeding function with an int instead of calling it — numpy was never
    # actually seeded. Call the function instead.
    np.random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)

seed()
# read the dataset
# NOTE(review): both CSVs are loaded via relative paths, so they must sit in
# the working directory — running from elsewhere raises FileNotFoundError.
twitter = pd.read_csv('twitter.csv')
reddit = pd.read_csv("reddit.csv")
# preview both
print("Twitter head")
# `display` is the IPython/Jupyter rich-display helper (notebook-only builtin)
display(twitter.head())
print()
print("Reddit head")
# the last expression of a notebook cell is auto-displayed
reddit.head()
Twitter head
| clean_text | category | |
|---|---|---|
| 0 | when modi promised “minimum government maximum... | -1.0 |
| 1 | talk all the nonsense and continue all the dra... | 0.0 |
| 2 | what did just say vote for modi welcome bjp t... | 1.0 |
| 3 | asking his supporters prefix chowkidar their n... | 1.0 |
| 4 | answer who among these the most powerful world... | 1.0 |
Reddit head
| clean_comment | category | |
|---|---|---|
| 0 | family mormon have never tried explain them t... | 1 |
| 1 | buddhism has very much lot compatible with chr... | 1 |
| 2 | seriously don say thing first all they won get... | -1 |
| 3 | what you have learned yours and only yours wha... | 0 |
| 4 | for your own benefit you may want read living ... | 1 |
# check for nulls: per-column NaN counts for each dataset
twitter.isna().sum() , reddit.isna().sum()
(clean_text 4 category 7 dtype: int64, clean_comment 100 category 0 dtype: int64)
# drop every row containing a null from both datasets, in place
for frame in (twitter, reddit):
    frame.dropna(axis=0, inplace=True)
# confirm no nulls remain in either dataset
reddit.isna().sum() , twitter.isna().sum()
(clean_comment 0 category 0 dtype: int64, clean_text 0 category 0 dtype: int64)
# inspect the distinct sentiment labels present in each dataset
for name, frame in (("twitter", twitter), ("reddit", reddit)):
    print(f"Unique in {name} are {frame['category'].unique()}")
Unique in twitter are [-1. 0. 1.] Unique in reddit are [ 1 -1 0]
category column has 3 values:
* 0 for a neutral sentiment
* 1 for a positive sentiment
* -1 for a negative sentiment
# check the size of the two datasets: (rows, columns) tuples
twitter.shape , reddit.shape
((162969, 2), (37149, 2))
# unify the text column name across both sources so they can be concatenated
twitter = twitter.rename(columns={"clean_text": "text"})
reddit = reddit.rename(columns={"clean_comment": "text"})
# sanity-check the rename took effect
twitter.head()
| text | category | |
|---|---|---|
| 0 | when modi promised “minimum government maximum... | -1.0 |
| 1 | talk all the nonsense and continue all the dra... | 0.0 |
| 2 | what did just say vote for modi welcome bjp t... | 1.0 |
| 3 | asking his supporters prefix chowkidar their n... | 1.0 |
| 4 | answer who among these the most powerful world... | 1.0 |
reddit.tail()
| text | category | |
|---|---|---|
| 37244 | jesus | 0 |
| 37245 | kya bhai pure saal chutiya banaya modi aur jab... | 1 |
| 37246 | downvote karna tha par upvote hogaya | 0 |
| 37247 | haha nice | 1 |
| 37248 | facebook itself now working bjp’ cell | 0 |
from sklearn.utils import shuffle

# stack the two datasets into one frame; ignore_index assigns a fresh
# 0..n-1 RangeIndex (equivalent to reset_index(drop=True) after concat)
df = pd.concat([twitter, reddit], ignore_index=True)
# shuffle the rows so twitter and reddit samples are interleaved
df = shuffle(df, random_state=12345).reset_index(drop=True)
# preview the merged, shuffled dataset
df.tail()
| text | category | |
|---|---|---|
| 200113 | went the spot where dia dropped there pay load... | -1.0 |
| 200114 | \npublic rally meerut police officials who are... | 0.0 |
| 200115 | kovind modi greet people rajasthan day via ind... | 0.0 |
| 200116 | please pronto | 0.0 |
| 200117 | kitne sharam baat only criticize some people a... | -1.0 |
# replace the numeric sentiment codes with readable labels
# (float keys like -1.0 hash equal to the int keys, so twitter's float
# categories map correctly too)
df['category'] = df['category'].map({-1: "negative", 0: "neutral", 1: "positive"})
df.head()
| text | category | |
|---|---|---|
| 0 | proud drdo superlative work and kudos modi gov... | positive |
| 1 | this was once said tamilaruvi maniyan too diff... | positive |
| 2 | its not modi but people like you who call them... | neutral |
| 3 | this also the responsibility our modi | neutral |
| 4 | modi doing job delhi and specially for you wil... | positive |
df.category.unique()
array(['positive', 'neutral', 'negative'], dtype=object)
# check the distribution of the category column using a plot
plt.figure(figsize=(9,8))
# FIX: pass the data as a keyword — positional arguments to countplot
# were deprecated in seaborn 0.12 and later removed
sns.countplot(x=df.category)
plt.title("Sentiment Distribution on reddit and Twitter data" , c ="r" , fontsize =14)
plt.savefig("label.png")
# check the fraction for each sentiment (normalize=True, not the int 1)
df.category.value_counts(normalize=True)
positive 0.440135 neutral 0.341064 negative 0.218801 Name: category, dtype: float64
import plotly.express as px
# interactive pie chart of the sentiment shares
fig = px.pie(df, names='category', title = 'Percentage for each sentiment in pie chart')
fig.show()
# build the English stopword list used later by CleanAndProcess
# NOTE(review): this rebinds the name `stopwords`, shadowing the
# `nltk.corpus.stopwords` module imported at the top of the file;
# from here on `stopwords` is a plain list of words.
stopwords = nltk.corpus.stopwords.words('english')
df.head()
| text | category | |
|---|---|---|
| 0 | proud drdo superlative work and kudos modi gov... | positive |
| 1 | this was once said tamilaruvi maniyan too diff... | positive |
| 2 | its not modi but people like you who call them... | neutral |
| 3 | this also the responsibility our modi | neutral |
| 4 | modi doing job delhi and specially for you wil... | positive |
def sentLengthPlot(sentiment , df , color , plot_name):
    """Plot the word-count distribution for one sentiment class.

    Left panel: a table of summary statistics (``describe``) of the text
    lengths; right panel: a histogram of lengths for the rows whose
    category equals ``sentiment``. The figure is saved to ``plot_name``.

    Args:
        sentiment: category label to filter on ("positive"/"negative"/"neutral").
        df: dataframe with "text" and "category" columns.
        color: histogram colour.
        plot_name: output image filename.
    """
    fig = plt.figure(figsize=(10,7))
    # FIX: compute word counts in a local Series instead of writing a
    # "length" column into the caller's df on every call (hidden mutation)
    lengths = df["text"].str.split().apply(len).rename("length")
    mask = df['category'] == sentiment
    ax1 = fig.add_subplot(122)
    sns.histplot(lengths[mask], ax=ax1 , color =color)
    describe = lengths[mask].describe().to_frame().round(2)
    ax2 = fig.add_subplot(121)
    # hide the axes: this subplot only hosts the statistics table
    ax2.axis('off')
    font_size = 14
    bbox = [0, 0, 1, 1]
    table = ax2.table(cellText = describe.values, rowLabels = describe.index, bbox=bbox, colLabels=describe.columns)
    table.set_fontsize(font_size)
    fig.suptitle(f'Distribution of text length for {sentiment} sentiment texts.', fontsize=14 , c ='r')
    plt.savefig(plot_name)
    plt.show()
# for positive sentiment
sentLengthPlot("positive" , df , "green" , "positive.png")
# for negative sentiment
sentLengthPlot("negative" , df , 'red' , "negative.png")
# for neutral sentiment (note: the output filename "neutal.png" is misspelled)
sentLengthPlot('neutral' , df ,'blue' , "neutal.png")
#### check common words using wordcloud
from wordcloud import WordCloud, STOPWORDS

def wordcount_gen(df, category):
    """Render, save and show a word cloud for all texts of one sentiment.

    Args:
        df: dataframe with "text" and "category" columns.
        category: sentiment label to filter on.
    """
    # concatenate every text of this sentiment into one corpus string
    corpus = " ".join(df.loc[df.category == category, 'text'])
    # build the wordcloud object (top 50 words, common stopwords removed)
    cloud = WordCloud(background_color='white',
                      max_words=50,
                      stopwords=STOPWORDS)
    # generate and plot the wordcloud, then persist it to disk
    plt.figure(figsize=(10,10))
    plt.imshow(cloud.generate(corpus))
    plt.title(f'{category} Sentiment Words', fontsize=14 , c="r")
    plt.axis('off')
    plt.savefig(f"{category}Wordcloud.png")
    plt.show()
# word cloud for positive texts
wordcount_gen(df, 'positive')
# word cloud for negative texts
wordcount_gen(df, 'negative')
# word cloud for neutral texts
wordcount_gen(df, 'neutral')
def CleanAndProcess(text):
    """Normalise a raw comment/tweet for vectorization.

    Lowercases, strips non-alphanumeric characters, removes English
    stopwords and reduces each remaining word to its Porter stem.

    Args:
        text: raw comment string.

    Returns:
        A cleaned, space-joined string of stemmed words.
    """
    # convert to lowercase
    text = text.lower()
    # keep letters and digits only; everything else becomes a space
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # tokenize the text into words
    words = text.split()
    # PERF FIX: membership tests against a set are O(1) vs O(n) on the
    # module-level `stopwords` list
    stop = set(stopwords)
    words = [w for w in words if w not in stop]
    # PERF FIX: build ONE stemmer instead of instantiating PorterStemmer
    # for every single word as the original did
    stemmer = PorterStemmer()
    words = [stemmer.stem(w) for w in words]
    # return a new sentence with the applied transformations
    return " ".join(words)
# spot-check the cleaning pipeline on a single row
print("Original text : ", df['text'][100])
print("\nProcessed text : ", CleanAndProcess(df['text'][100]))
Original text : cpim will decommunilize the textbooks modi mamata will featured syllebus Processed text : cpim decommunil textbook modi mamata featur syllebu
# apply the cleaning pipeline to every row to get the X features (a Series of strings)
X = df['text'].apply(CleanAndProcess)
# counter for most common words
from collections import Counter

def counter(comment_clear):
    """Count word frequencies across an iterable of cleaned texts.

    Args:
        comment_clear: iterable of whitespace-separated strings.

    Returns:
        collections.Counter mapping word -> total occurrences.
    """
    cnt = Counter()
    # Counter.update tallies each text's words in one C-level pass,
    # replacing the original hand-rolled inner loop over words
    for words in comment_clear:
        cnt.update(words.split())
    return cnt
# apply the count function to the cleaned texts
counts = counter(X)
# plot the top 20 common words
# zip(*pairs) splits the (word, count) tuples into two parallel sequences
top_20_words, top_20_count = zip(*counts.most_common(20))
plt.figure(figsize=(10 ,9))
plt.title("20 Most common words in the dataset " , c ="r" , fontsize=14)
# FIX: pass data as keywords — positional arguments to barplot were
# deprecated in seaborn 0.12 and later removed
sns.barplot(x=list(top_20_words), y=list(top_20_count))
plt.savefig("20mostcommon.png")
from sklearn.preprocessing import LabelEncoder

# re-encode the string categories back to integers for the models
encoder = LabelEncoder()
# fit_transform is equivalent to fit() followed by transform()
label = encoder.fit_transform(df['category'])
# split the text features into training and testing datasets
# split into train and test with 85% train and 15% test
# NOTE(review): stratify is not passed, so class proportions may drift
# slightly between the two splits
X_train, X_test, y_train, y_test = train_test_split(
X, label,
test_size=0.15,
random_state=SEED)
# check the new sizes: (train X, train y, test X, test y)
X_train.shape , y_train.shape , X_test.shape , y_test.shape
((170100,), (170100,), (30018,), (30018,))
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# define vectorizers
# count vectorizer: raw term counts per document
countVect = CountVectorizer()
# tf-idf vectorizer: term counts re-weighted by inverse document frequency
tfidfVect = TfidfVectorizer()
# fit count vectorizer
# NOTE(review): both vectorizers are fitted on ALL of X (train + test),
# which leaks test-set vocabulary and document frequencies into training;
# fitting on X_train only would be the stricter methodology
countVect.fit(X)
# fit tf-idf vectorizer
tfidfVect.fit(X)
TfidfVectorizer()
# transform the training split into sparse document-term matrices
# (note: "tdfif" is a typo for "tfidf", kept because later cells use it)
X_train_count = countVect.transform(X_train)
X_train_tdfif = tfidfVect.transform(X_train)
# transform the testing split with the same fitted vectorizers
X_test_tdfif = tfidfVect.transform(X_test)
X_test_count = countVect.transform(X_test)
from sklearn.naive_bayes import MultinomialNB , GaussianNB
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBRFClassifier
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
# help(CatBoostClassifier)
# # test an ensembel model..
# # We will use Gradientboost
# model = GradientBoostingClassifier()
# # train
# model.fit(X_train_count , y_train)
# # get score
# model.score(X_test_count , y_test)
# # this model takes alot of time to run..
# # the score for the model is 0.7290625624625224
# baseline models: logistic regression and multinomial naive bayes,
# each trained on both vectorizations of the text
logit_tdif = LogisticRegression()
logit_count = LogisticRegression()
nb_count = MultinomialNB()
nb_tdif = MultinomialNB()

# (model, train matrix, test matrix, description) for each experiment,
# in the same order the original cells ran them
baselines = [
    (logit_count, X_train_count, X_test_count, "COUNT Vectorizer on Logistic reg"),
    (logit_tdif, X_train_tdfif, X_test_tdfif, "TDIF Vectorizer on Logistic reg"),
    (nb_count, X_train_count, X_test_count, "COUNT Vectorizer on MultinomialNB"),
    (nb_tdif, X_train_tdfif, X_test_tdfif, "TDIF Vectorizer on MultinomialNB"),
]
# train each model and report its accuracy on the held-out test split
for model, train_mat, test_mat, desc in baselines:
    model.fit(train_mat, y_train)
    print(f"{desc} score is {model.score(test_mat, y_test)}")
COUNT Vectorizer on Logistic reg score is 0.8586847891265241 TDIF Vectorizer on Logistic reg score is 0.8506895862482511 COUNT Vectorizer on MultinomialNB score is 0.6994803118129123 TDIF Vectorizer on MultinomialNB score is 0.5671597041774935
# define the parameters to be tuned
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
# 3-fold stratified CV repeated 3 times -> 9 fits per parameter combination
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=SEED)
# grid params for Logistic regression (liblinear solver only)
Log_grid_param = {
"solver":["liblinear"],
"C":[.5 , .7],
"max_iter":[1000 , 1500]
}
# exhaustive search over the grid, parallelised across all cores (n_jobs=-1)
grid_search = GridSearchCV(
LogisticRegression(),
Log_grid_param,
n_jobs=-1,
cv=cv,
)
############################START TUNE PARAMETERS####################
# vectorize the FULL dataset for the search
# NOTE(review): tuning on train + test data means best_score_ is not a
# clean held-out estimate
X_features = countVect.transform(X)
# fit the data on X features
grid_result = grid_search.fit(X_features, label)
# get the model summary results
print(f"Best score is : {grid_result.best_score_}")
# print the parameters for best score
print("\n")
print("best params")
print(grid_result.best_params_)
Best score is : 0.8543142879034702
best params
{'C': 0.7, 'max_iter': 1000, 'solver': 'liblinear'}
# report the mean/std test score for every parameter combination tried
results = grid_result.cv_results_
for mean, std_dev, param in zip(results['mean_test_score'],
                                results['std_test_score'],
                                results['params']):
    print(f"{mean} {std_dev} with: {param}")
0.853319874607315 0.0016664566112692145 with: {'C': 0.5, 'max_iter': 1000, 'solver': 'liblinear'}
0.853319874607315 0.0016664566112692145 with: {'C': 0.5, 'max_iter': 1500, 'solver': 'liblinear'}
0.8543142879034702 0.0016088402804799772 with: {'C': 0.7, 'max_iter': 1000, 'solver': 'liblinear'}
0.8543142879034702 0.0016088402804799772 with: {'C': 0.7, 'max_iter': 1500, 'solver': 'liblinear'}
#################### END TUNE PARAMETERS######################
# FIX: read the best parameters straight from the grid search instead of
# hard-coding them (the original hard-coded
# {'C': 0.7, 'max_iter': 1000, 'solver': 'liblinear'}, which silently
# goes stale if the grid or data changes)
best_parameter = grid_result.best_params_
# retrain a final model with the best parameters on the train split only
final_model = LogisticRegression(**best_parameter , random_state=SEED)
# fit the model
final_model.fit(X_train_count , y_train)
# get the predictions
preds = final_model.predict(X_test_count)
# get various scores of the predicted results
from sklearn.metrics import accuracy_score , precision_score , recall_score , confusion_matrix ,f1_score
# with average='micro' on single-label data, recall == precision ==
# f1 == accuracy, so all four prints show the same number
print(f"Recall score for the model is {recall_score(y_test , preds , average='micro')}")
print(f"Precision score for the model is {precision_score(y_test , preds , average='micro')}")
print(f"Accuracy score for the model is {accuracy_score(y_test , preds)}")
print(f"F-1 Score for the model is {f1_score(y_test , preds , average = 'micro')}")
Recall score for the model is 0.8627157039109867 Precision score for the model is 0.8627157039109867 Accuracy score for the model is 0.8627157039109867 F-1 Score for the model is 0.8627157039109867
# heat map of the confusion matrix: actual vs predicted counts per class
# (FIX: the original held this description in a bare triple-quoted string,
# which is a no-op statement, not a comment; it also called
# sns.color_palette("bright") and discarded the result — that call does
# not change the active palette, so it has been removed)
plt.figure(figsize=(10,10))
sns.heatmap(
    confusion_matrix(y_test , preds),
    cbar = False,
    annot=True,
    fmt=""
)
plt.xlabel("Predicted Category")
plt.ylabel("Actual Category")
plt.title('Actual vs Predicted category report ')
plt.savefig("cmap.png")